R Markdown
setwd("/Users/kenyaroy/doing data science/unit 8")
Beers = read.csv("/Users/kenyaroy/doing data science/unit 8/Beers.csv")
Breweries = read.csv("/Users/kenyaroy/doing data science/unit 8/Breweries.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(naniar)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(usmap)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 1.0.1
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
#How many breweries are present in each state
Breweries %>% group_by(State) %>% dplyr::summarize(n()) %>% print(n=51)
## # A tibble: 51 × 2
## State `n()`
## <chr> <int>
## 1 " AK" 7
## 2 " AL" 3
## 3 " AR" 2
## 4 " AZ" 11
## 5 " CA" 39
## 6 " CO" 47
## 7 " CT" 8
## 8 " DC" 1
## 9 " DE" 2
## 10 " FL" 15
## 11 " GA" 7
## 12 " HI" 4
## 13 " IA" 5
## 14 " ID" 5
## 15 " IL" 18
## 16 " IN" 22
## 17 " KS" 3
## 18 " KY" 4
## 19 " LA" 5
## 20 " MA" 23
## 21 " MD" 7
## 22 " ME" 9
## 23 " MI" 32
## 24 " MN" 12
## 25 " MO" 9
## 26 " MS" 2
## 27 " MT" 9
## 28 " NC" 19
## 29 " ND" 1
## 30 " NE" 5
## 31 " NH" 3
## 32 " NJ" 3
## 33 " NM" 4
## 34 " NV" 2
## 35 " NY" 16
## 36 " OH" 15
## 37 " OK" 6
## 38 " OR" 29
## 39 " PA" 25
## 40 " RI" 5
## 41 " SC" 4
## 42 " SD" 1
## 43 " TN" 3
## 44 " TX" 28
## 45 " UT" 4
## 46 " VA" 16
## 47 " VT" 10
## 48 " WA" 23
## 49 " WI" 20
## 50 " WV" 1
## 51 " WY" 4
Breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar() + ggtitle("Distribution of Breweries by State") + ylab(" # of Breweries") + geom_text(stat = "count", aes(label = after_stat(count)), vjust = 0) + theme(legend.position = "none")

#Heat Map of breweries in each state.
library(usmap)
StateBeerC = data.frame(state = c("AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"),values = c(7,3,2,11,39,47,8,1,2,15,7,4,5,5,18,22,3,4,5,23,7,9,32,12,9,2,9,19,1,5,3,3,4,2,16,15,6,29,25,5,4,1,3,28,4,16,10,23,20,1,4))
plot_usmap(data = StateBeerC, values = "values", regions = "state") + scale_fill_continuous(low = "yellow", high = "red", name = "Number of Breweries", label = scales::comma) + labs(title = "Number of Breweries By State", ) + theme(legend.position = "right")

#Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the #merged file.
Breweries$Brewery_id = Breweries$Brew_ID
Breweries <- Breweries %>% select(Brewery_id, Name, City, State)
BB <- merge(Beers,Breweries, by = "Brewery_id", all = TRUE)
dim(BB)
## [1] 2410 10
summary(BB)
## Brewery_id Name.x Beer_ID ABV
## Min. : 1.0 Length:2410 Min. : 1.0 Min. :0.00100
## 1st Qu.: 94.0 Class :character 1st Qu.: 808.2 1st Qu.:0.05000
## Median :206.0 Mode :character Median :1453.5 Median :0.05600
## Mean :232.7 Mean :1431.1 Mean :0.05977
## 3rd Qu.:367.0 3rd Qu.:2075.8 3rd Qu.:0.06700
## Max. :558.0 Max. :2692.0 Max. :0.12800
## NA's :62
## IBU Style Ounces Name.y
## Min. : 4.00 Length:2410 Min. : 8.40 Length:2410
## 1st Qu.: 21.00 Class :character 1st Qu.:12.00 Class :character
## Median : 35.00 Mode :character Median :12.00 Mode :character
## Mean : 42.71 Mean :13.59
## 3rd Qu.: 64.00 3rd Qu.:16.00
## Max. :138.00 Max. :32.00
## NA's :1005
## City State
## Length:2410 Length:2410
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
head(BB)
## Brewery_id Name.x Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Name.y City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
tail(BB)
## Brewery_id Name.x Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Name.y City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#Address the missing values in each column
#https://www.masterclass.com/articles/ibu-beer
#There are styles of beer and each style has a range of IBU, we could use the sytle of the beer to assign an average value
MeanIBU <- BB %>% filter(!is.na(IBU)) %>% group_by(Style) %>% dplyr::summarize(IBUMean = mean(IBU))
TestBB <- merge(BB,MeanIBU, by="Style")
TestBB$IBU[is.na(TestBB$IBU)] <- TestBB$IBUMean[is.na(TestBB$IBU)]
MeanABV <- BB %>% filter(!is.na(ABV)) %>% group_by(Style) %>% dplyr::summarize(ABVMean=mean(ABV))
TestBB1 <- merge(TestBB,MeanABV, by="Style")
TestBB1$ABV[is.na(TestBB1$ABV)] <- TestBB1$ABVMean[is.na(TestBB1$ABV)]
gg_miss_var(TestBB1) + ggtitle("Missing Values in Dataset")

#In order to find the missing values in ABV and IBU, we calculated the mean of each variable by Style and imputed #them into BB dataframe
#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
TestBB %>% ggplot(aes(x = State, y = IBU, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median IBU of Breweries by State") + ylab("International Bitterness Unit") + theme(legend.position="none")

#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
TestBB1 %>% ggplot(aes(x = State, y = ABV, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median ABV of Breweries by State") + ylab("Alcohol By Volume") + theme(legend.position="none")

#Which state has the most bitter (IBU) beer?
TestBB %>% dplyr::summarise(max(IBU))
## max(IBU)
## 1 138
TestBB %>% filter(IBU == "138")
## Style Brewery_id Name.x Beer_ID
## 1 American Double / Imperial IPA 375 Bitter Bitch Imperial IPA 980
## ABV IBU Ounces Name.y City State IBUMean
## 1 0.082 138 12 Astoria Brewing Company Astoria OR 93.32
p <- BB %>% ggplot(aes(x=IBU, fill=State)) + geom_bar() + ggtitle("Finding State with Maximum IBU") + theme(legend.position="none") + ylab("Beers")
ggplotly(p)
## Warning: Removed 1005 rows containing non-finite values (`stat_count()`).
#The state with the most bitter (IBU) beer is Oregon.
#Which state has the maximum alcoholic (ABV) beer?
TestBB1 %>% dplyr::summarise(max(ABV))
## max(ABV)
## 1 0.128
TestBB1 %>% filter(ABV == "0.128")
## Style Brewery_id
## 1 Quadrupel (Quad) 52
## Name.x Beer_ID ABV IBU Ounces
## 1 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565 0.128 24 19.2
## Name.y City State IBUMean ABVMean
## 1 Upslope Brewing Company Boulder CO 24 0.104
p <- BB %>% ggplot(aes(x=ABV, fill=State)) + geom_bar() + ggtitle("Finding State with Maximum ABV") + theme(legend.position="none") + ylab("Beers")
ggplotly(p)
## Warning: Removed 62 rows containing non-finite values (`stat_count()`).
#The state with the maximum alcoholic (ABV) beer is Colorado.
#Distribution of ABV versus IBU
summary(TestBB1$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02700 0.05000 0.05600 0.05972 0.06700 0.12800
BB %>% filter(!is.na(ABV) & !is.na(IBU)) %>% select(ABV, IBU) %>% ggpairs()

BB %>% ggplot(aes(x=ABV, y=IBU)) + geom_point() +geom_smooth(position="jitter") + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1005 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1005 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1005 rows containing missing values (`geom_point()`).

TestBB1 %>% select(ABV, IBU) %>% ggpairs()

TestBB1 %>% ggplot(aes(x=ABV, y=IBU)) + geom_point() +geom_smooth(position="jitter") + geom_smooth() + ggtitle("Distribution of ABV versus IBU")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

#Distribution of IBU versus ABV by Style (IPA and Ale)
library(dplyr)
IPA <- TestBB1 %>% filter(grepl('IPA', Style))
Ale <- TestBB1 %>% filter(grepl('Ale', Style))
head(IPA)
## Style Brewery_id Name.x Beer_ID
## 1 American Double / Imperial IPA 167 Gordon Imperial Red (2010) 806
## 2 American Double / Imperial IPA 215 More Cowbell 2123
## 3 American Double / Imperial IPA 504 Gordon (2005) 805
## 4 American Double / Imperial IPA 167 GUBNA Imperial IPA 6
## 5 American Double / Imperial IPA 375 Bitter Bitch Imperial IPA 980
## 6 American Double / Imperial IPA 175 MechaHopzilla 1114
## ABV IBU Ounces Name.y City State
## 1 0.087 85.00 12 Oskar Blues Brewery Longmont CO
## 2 0.090 118.00 16 Buffalo Bayou Brewing Company Houston TX
## 3 0.092 85.00 12 Oskar Blues Brewery Lyons CO
## 4 0.099 100.00 12 Oskar Blues Brewery Longmont CO
## 5 0.082 138.00 12 Astoria Brewing Company Astoria OR
## 6 0.088 93.32 16 New Orleans Lager & Ale Brewing ... New Orleans LA
## IBUMean ABVMean
## 1 93.32 0.08736893
## 2 93.32 0.08736893
## 3 93.32 0.08736893
## 4 93.32 0.08736893
## 5 93.32 0.08736893
## 6 93.32 0.08736893
head(Ale)
## Style Brewery_id Name.x Beer_ID
## 1 Abbey Single Ale 58 Abbey's Single (2015- ) 2505
## 2 Abbey Single Ale 58 Abbey's Single Ale (Current) 1887
## 3 American Amber / Red Ale 387 Colorado Red Ale 220
## 4 American Amber / Red Ale 61 Reprise Centennial Red 2288
## 5 American Amber / Red Ale 487 Wisco Disco 953
## 6 American Amber / Red Ale 202 Loki Red Ale 2191
## ABV IBU Ounces Name.y City State IBUMean
## 1 0.049 22.0000 12 Destihl Brewery Bloomington IL 22.0000
## 2 0.049 22.0000 12 Destihl Brewery Bloomington IL 22.0000
## 3 0.062 36.2987 12 Revolution Brewing Paonia CO 36.2987
## 4 0.060 36.2987 12 4 Hands Brewing Company Saint Louis MO 36.2987
## 5 0.051 31.0000 16 Stillmank Beer Company Green Bay WI 36.2987
## 6 0.075 53.0000 16 Fearless Brewing Company Estacada OR 36.2987
## ABVMean
## 1 0.049000
## 2 0.049000
## 3 0.057456
## 4 0.057456
## 5 0.057456
## 6 0.057456
BBIPAAle <- TestBB1 %>% filter(str_detect(Style, "IPA")|str_detect(Style, " Ale"))
BBIPAAle$AleType <- ifelse(str_detect(BBIPAAle$Style,"IPA"),"IPA","Ale")
BBIPAAle %>% ggplot(aes(x=ABV, y = IBU, color = AleType)) + geom_point() +ggtitle("Distribution of IBU versus ABV by Style (IPA and Ale)")

#Internal classification knn
classifications = knn.cv(BBIPAAle[,c(5,6)],BBIPAAle$AleType, prob = TRUE, k = 5)
table(classifications,BBIPAAle$AleType)
##
## classifications Ale IPA
## Ale 893 64
## IPA 69 507
CM = confusionMatrix(table(classifications,BBIPAAle$AleType))
CM
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA
## Ale 893 64
## IPA 69 507
##
## Accuracy : 0.9132
## 95% CI : (0.898, 0.9269)
## No Information Rate : 0.6275
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8147
##
## Mcnemar's Test P-Value : 0.7287
##
## Sensitivity : 0.9283
## Specificity : 0.8879
## Pos Pred Value : 0.9331
## Neg Pred Value : 0.8802
## Prevalence : 0.6275
## Detection Rate : 0.5825
## Detection Prevalence : 0.6243
## Balanced Accuracy : 0.9081
##
## 'Positive' Class : Ale
##
#Naive Bayes
splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
##
## Ale IPA
## Ale 245 25
## IPA 35 155
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
CM
## Confusion Matrix and Statistics
##
##
## Ale IPA
## Ale 245 25
## IPA 35 155
##
## Accuracy : 0.8696
## 95% CI : (0.8353, 0.899)
## No Information Rate : 0.6087
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7289
##
## Mcnemar's Test P-Value : 0.2453
##
## Sensitivity : 0.8750
## Specificity : 0.8611
## Pos Pred Value : 0.9074
## Neg Pred Value : 0.8158
## Prevalence : 0.6087
## Detection Rate : 0.5326
## Detection Prevalence : 0.5870
## Balanced Accuracy : 0.8681
##
## 'Positive' Class : Ale
##
#External classification knn (train and test)
splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
##
## Ale IPA
## Ale 255 20
## IPA 41 144
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
CM
## Confusion Matrix and Statistics
##
##
## Ale IPA
## Ale 255 20
## IPA 41 144
##
## Accuracy : 0.8674
## 95% CI : (0.8329, 0.897)
## No Information Rate : 0.6435
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.719
##
## Mcnemar's Test P-Value : 0.01045
##
## Sensitivity : 0.8615
## Specificity : 0.8780
## Pos Pred Value : 0.9273
## Neg Pred Value : 0.7784
## Prevalence : 0.6435
## Detection Rate : 0.5543
## Detection Prevalence : 0.5978
## Balanced Accuracy : 0.8698
##
## 'Positive' Class : Ale
##
#Separate dataset by beer style (IPA, Ale, Lager) and create bar chart.
BBClassify <- TestBB1
BBClassify$BeerType = BBClassify$AleType
BBClassify$BeerType = ifelse(str_detect(BBClassify$Style, "IPA"),"IPA",ifelse(str_detect(BBClassify$Style, "Stout"), "Stout",ifelse(str_detect(BBClassify$Style, "Pilsner"),"Pilsner",ifelse(str_detect(BBClassify$Style, "Beer"),"Beer",ifelse(str_detect(BBClassify$Style, " Ale"),"Ale",ifelse(str_detect(BBClassify$Style, "Lager"),"Lager","Other"))))))
BBClassify %>% filter(BeerType == "IPA" |BeerType == "Ale" |BeerType == "Lager") %>% ggplot(aes(x=IBU, y = ABV, color = BeerType)) + geom_jitter() + ggtitle("Distribution of IBU versus ABV by Style (IPA, Ale, Lager)")

BBClassify %>% ggplot(aes(x=BeerType, fill= BeerType)) + geom_bar() + ggtitle("Distribution of Beers by Beer Type") + ylab("Beers") + xlab("Beer Type")
